- Notifications
You must be signed in to change notification settings - Fork 849
/
Copy pathOML4Py Data Transformation One Hot Encoding.dsnb
executable file
·1 lines (1 loc) · 7.7 KB
/
OML4Py Data Transformation One Hot Encoding.dsnb
1
[{"layout":null,"template":null,"templateConfig":null,"name":"OML4Py Data Transformation One Hot Encoding","description":null,"readOnly":false,"type":"low","paragraphs":[{"col":0,"visualizationConfig":null,"hideInIFrame":false,"selectedVisualization":null,"title":null,"message":["%md"," "],"enabled":true,"result":null,"sizeX":0,"hideCode":true,"width":12,"hideResult":true,"dynamicFormParams":null,"row":0,"hasTitle":false,"hideVizConfig":true,"hideGutter":true,"relations":[],"forms":"[]"},{"col":0,"visualizationConfig":null,"hideInIFrame":false,"selectedVisualization":"html","title":null,"message":["%md","","# OML4Py Data Transformation: One Hot Encoding","In this notebook, we demonstrate how to do one hot encoding using OML4Py.","","We use the customer insurance lifetime value data set which contains customer financial information, lifetime value, and whether or not the customer bought insurance.","","The dataset `CUSTOMER_INSURANCE_LTV` is generated by the `\"OML Run-me-first\"` notebook, which `MUST` be run before this notebook.","","###### IMPORTANT: The `\"OML Run-me-first\"` notebook is available under the menu `Templates -> Examples` and is a pre-requisite to the current notebook.","","**Note**: If the user plans to use the in-database algorithms, one hot encoding is automatically applied for those algorithms requiring it. The in-databae algorithms automatically explode the categorical columns and fit the model on the prepared data internally. For more details, see <a href=\"https://docs.oracle.com/en/database/oracle/machine-learning/oml4sql/21/dmcon/generalized-linear-model.html#GUID-19B8E133-0029-4892-88BB-3E1C9E83EB12\" onclick=\"return ! window.open('https://docs.oracle.com/en/database/oracle/machine-learning/oml4sql/21/dmcon/generalized-linear-model.html#GUID-19B8E133-0029-4892-88BB-3E1C9E83EB12');\"> Data Preparation for GLM <\/a>","","Copyright (c) 2024 Oracle Corporation ","###### <a href=\"https://oss.oracle.com/licenses/upl/\" onclick=\"return ! window.open('https://oss.oracle.com/licenses/upl/');\">The Universal Permissive License (UPL), Version 1.0<\/a>"],"enabled":true,"result":null,"sizeX":0,"hideCode":true,"width":12,"hideResult":false,"dynamicFormParams":null,"row":0,"hasTitle":false,"hideVizConfig":true,"hideGutter":true,"relations":[],"forms":"[]"},{"col":0,"visualizationConfig":null,"hideInIFrame":false,"selectedVisualization":"html","title":"For more information ...","message":["%md ","","* <a href=\"https://docs.oracle.com/en/cloud/paas/autonomous-data-warehouse-cloud/index.html\" target=\"_blank\">Oracle ADW Documentation<\/a>","* <a href=\"https://github.com/oracle/oracle-db-examples/tree/master/machine-learning\" target=\"_blank\">OML folder on Oracle GitHub<\/a>","* <a href=\"https://www.oracle.com/machine-learning\" target=\"_blank\">OML Web Page<\/a>","* <a href=\"https://docs.oracle.com/en/database/oracle/machine-learning/oml4py/2/mlpug/cross-tabulate-data.html\" target=\"_blank\">OML4Py Cross-Tabulation<\/a>","* <a href=\"https://github.com/oracle-samples/oracle-db-examples/tree/main/machine-learning\" target=\"_blank\">OML Folder on Oracle Github<\/a>"],"enabled":true,"result":null,"sizeX":0,"hideCode":true,"width":12,"hideResult":false,"dynamicFormParams":null,"row":0,"hasTitle":true,"hideVizConfig":false,"hideGutter":true,"relations":[],"forms":"[]"},{"col":0,"visualizationConfig":null,"hideInIFrame":false,"selectedVisualization":"html","title":"Import libraries ","message":["%python","","import pandas as pd","import oml"],"enabled":true,"result":null,"sizeX":0,"hideCode":false,"width":12,"hideResult":false,"dynamicFormParams":null,"row":0,"hasTitle":true,"hideVizConfig":false,"hideGutter":true,"relations":[],"forms":"[]"},{"col":0,"visualizationConfig":null,"hideInIFrame":false,"selectedVisualization":"table","title":"Get a proxy object to CUSTOMER_INSURANCE_LTV and subset of columns","message":["%python","","CUST_DF = oml.sync(table = 'CUSTOMER_INSURANCE_LTV')","cat_cols = ['MARITAL_STATUS', 'STATE', 'GENDER', 'REGION']","CUST_CAT_DF = CUST_DF[['CUSTOMER_ID'] + cat_cols]","z.show(CUST_CAT_DF.head())"],"enabled":true,"result":null,"sizeX":0,"hideCode":false,"width":12,"hideResult":false,"dynamicFormParams":null,"row":0,"hasTitle":true,"hideVizConfig":false,"hideGutter":true,"relations":[],"forms":"[]"},{"col":0,"visualizationConfig":null,"hideInIFrame":false,"selectedVisualization":"html","title":"Create function for one hot encoding","message":["%python","","def encode(DF, ids, col):"," assert isinstance(ids, list)"," CNT_DF = DF.crosstab(ids, col)"," PIVOT_DF = CNT_DF.pivot_table(ids, col, 'count', aggfunc = oml.DataFrame.count)"," cols = PIVOT_DF.columns[len(ids):]"," new_columns = [ col + '_' + c.split('_')[1][1:-1] for c in cols]"," new_columns = ids + new_columns"," PIVOT_DF.rename(columns = new_columns)"," return PIVOT_DF"," "],"enabled":true,"result":null,"sizeX":0,"hideCode":false,"width":12,"hideResult":false,"dynamicFormParams":null,"row":0,"hasTitle":true,"hideVizConfig":false,"hideGutter":true,"relations":[],"forms":"[]"},{"col":0,"visualizationConfig":null,"hideInIFrame":false,"selectedVisualization":"table","title":"Apply one hot encoding to MARITAL_STATUS column and check the transformed data","message":["%python","","ONEHOT_DF = encode(CUST_CAT_DF, ['CUSTOMER_ID'], 'MARITAL_STATUS') ","z.show(ONEHOT_DF.head())"," "],"enabled":true,"result":null,"sizeX":0,"hideCode":false,"width":12,"hideResult":false,"dynamicFormParams":null,"row":0,"hasTitle":true,"hideVizConfig":false,"hideGutter":true,"relations":[],"forms":"[]"},{"col":0,"visualizationConfig":null,"hideInIFrame":false,"selectedVisualization":"table","title":"Apply one hot encoding to all selected categorical columns","message":["%python","","RES_DF = CUST_CAT_DF[['CUSTOMER_ID']]","for i, col in enumerate(cat_cols):"," if col == 'CUSTOMER_ID':"," continue"," ONEHOT_DF = encode(CUST_CAT_DF, ['CUSTOMER_ID'], col)"," RES_DF = RES_DF.merge(ONEHOT_DF, on ='CUSTOMER_ID', how = 'inner', suffixes = ['', ''])"," ","z.show(RES_DF.head())"],"enabled":true,"result":null,"sizeX":0,"hideCode":false,"width":12,"hideResult":false,"dynamicFormParams":null,"row":0,"hasTitle":true,"hideVizConfig":false,"hideGutter":true,"relations":[],"forms":"[]"},{"col":0,"visualizationConfig":null,"hideInIFrame":false,"selectedVisualization":"html","title":"Replace None and nan values with 0","message":["%python","","cols = RES_DF.columns[1:]","for col in cols:"," RES_DF = RES_DF.replace(old = [None], new = [0.0], default = 1.0, columns = [col])"],"enabled":true,"result":null,"sizeX":0,"hideCode":false,"width":12,"hideResult":false,"dynamicFormParams":null,"row":0,"hasTitle":true,"hideVizConfig":false,"hideGutter":true,"relations":[],"forms":"[]"},{"col":0,"visualizationConfig":null,"hideInIFrame":false,"selectedVisualization":"table","title":"Check the result","message":["%python","","z.show(RES_DF.head())"],"enabled":true,"result":null,"sizeX":0,"hideCode":false,"width":12,"hideResult":false,"dynamicFormParams":null,"row":0,"hasTitle":true,"hideVizConfig":false,"hideGutter":true,"relations":[],"forms":"[]"},{"col":0,"visualizationConfig":null,"hideInIFrame":false,"selectedVisualization":"html","title":null,"message":["%md","","## End of Script"],"enabled":true,"result":null,"sizeX":0,"hideCode":true,"width":12,"hideResult":false,"dynamicFormParams":null,"row":0,"hasTitle":false,"hideVizConfig":true,"hideGutter":true,"relations":[],"forms":"[]"},{"col":0,"visualizationConfig":null,"hideInIFrame":false,"selectedVisualization":"html","title":null,"message":["%md"],"enabled":true,"result":null,"sizeX":0,"hideCode":true,"width":12,"hideResult":true,"dynamicFormParams":null,"row":0,"hasTitle":false,"hideVizConfig":true,"hideGutter":true,"relations":[],"forms":"[]"}],"version":"6","snapshot":false,"tags":null}]